import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
1.A. Refer above table and find the joint probability of the people who planned to purchase and actually placed an order. [1 Mark]
# People who planned to purchase and actually placed an order = 400
# Total = 2000
output1 = (400/2000)
print('The joint probability of the people who planned to purchase and actually placed an order is %1.4f' % output1)
The joint probability of the people who planned to purchase and actually placed an order is 0.2000
1.B. Refer to the above table and find the joint probability of the people who planned to purchase and actually placed an order, given that people planned to purchase. [1 Mark]
# People who planned to purchase and actually placed an order = 400
# People who planned to purchase = 500
# Total = 2000
output2 = (400/2000) / (500/2000)
print('the joint probability of the people who planned to purchase and actually placed an order, given that people planned to purchase is %1.4f' % output2)
the joint probability of the people who planned to purchase and actually placed an order, given that people planned to purchase is 0.8000
# The failure rate for the manufactured item is 5%
# sample of manufactured items is selected is 10
failure_rate = (5/100)
sample_size = 10
k = np.arange(0, 12)
pmf = stats.binom.pmf(k, sample_size, failure_rate)
pmf
array([5.98736939e-01, 3.15124705e-01, 7.46347985e-02, 1.04750594e-02,
9.64808106e-04, 6.09352488e-05, 2.67259863e-06, 8.03789062e-08,
1.58642578e-09, 1.85546875e-11, 9.76562500e-14, 0.00000000e+00])
cdf = stats.binom.cdf(k, sample_size, failure_rate)
cdf
array([0.59873694, 0.91386164, 0.98849644, 0.9989715 , 0.99993631,
0.99999725, 0.99999992, 1. , 1. , 1. ,
1. , 1. ])
2.A. Probability that none of the items are defective? [1 Mark]
print('Probability that none of the items are defective is %1.4f' % pmf[0])
Probability that none of the items are defective is 0.5987
2.B. Probability that exactly one of the items is defective? [1 Mark]
print('Probability that exactly one of the items is defective is %1.4f' % pmf[1])
Probability that exactly one of the items is defective is 0.3151
2.C. Probability that two or fewer of the items are defective? [1 Mark]
print('Probability that two or fewer of the items are defective is %1.4f' % cdf[2])
Probability that two or fewer of the items are defective is 0.9885
2.D. Probability that three or more of the items are defective ? [1 Mark]
print('Probability that two or fewer of the items are defective is %1.4f' % (1 - cdf[2]))
Probability that two or fewer of the items are defective is 0.0115
plt.plot(k, pmf, 'o-')
plt.title('Binomial Distribution')
plt.xlabel('Number of Defective Items')
plt.ylabel('Probability of Defective Items')
plt.show()
# Sells on an average cars per week = 3
average_policies = 3
k = np.arange(0, 16)
poisson_cdf = stats.poisson.cdf(k, average_policies)
poisson_cdf
array([0.04978707, 0.19914827, 0.42319008, 0.64723189, 0.81526324,
0.91608206, 0.96649146, 0.9880955 , 0.99619701, 0.99889751,
0.99970766, 0.99992861, 0.99998385, 0.9999966 , 0.99999933,
0.99999988])
poisson_pmf = stats.poisson.pmf(k, average_policies)
poisson_pmf
array([4.97870684e-02, 1.49361205e-01, 2.24041808e-01, 2.24041808e-01,
1.68031356e-01, 1.00818813e-01, 5.04094067e-02, 2.16040315e-02,
8.10151179e-03, 2.70050393e-03, 8.10151179e-04, 2.20950322e-04,
5.52375804e-05, 1.27471339e-05, 2.73152870e-06, 5.46305740e-07])
3.A. What is Probability that in a given week he will sell some cars? [1 Mark]
print('Probability that in a given week he will sell some cars is %1.4f' % (1 - poisson_cdf[0]))
Probability that in a given week he will sell some cars is 0.9502
3.B. What is Probability that in a given week he will sell 2 or more but less than 5 cars? [1 Mark]
print('Probability that in a given week he will sell some cars is %1.4f' % (poisson_cdf[4] - poisson_cdf[1]))
Probability that in a given week he will sell some cars is 0.6161
3.C. Plot the poisson distribution function for cumulative probability of cars sold per-week vs number of cars sold per week. [1 Mark]
plt.plot(k, poisson_pmf, 'o-')
plt.title('Poisson Distribution')
plt.xlabel('Number of cars sold per-week')
plt.ylabel('Cumulative Probability of cars sold per-week')
plt.show()
# The percentage of orders that are taken correctly is 86.8%
orders_taken_correctly = (86.8/100)
sample_size = 3
k = np.arange(0, 12)
binomial_pmf = stats.binom.pmf(k, sample_size, orders_taken_correctly)
binomial_pmf
array([0.00229997, 0.0453721 , 0.2983559 , 0.65397203, 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. ])
binomial_cdf = stats.binom.cdf(k, sample_size, orders_taken_correctly)
binomial_cdf
array([0.00229997, 0.04767206, 0.34602797, 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. ])
4.A. What is the probability that all three orders will be recognised correctly? [1 Mark]
print('Probability that all three orders will be recognised correctly is %1.4f' % binomial_pmf[3])
print('In percentage, %1.2f' % (binomial_pmf[3] * 100),'%')
Probability that all three orders will be recognised correctly is 0.6540 In percentage, 65.40 %
4.B. What is the probability that none of the three orders will be recognised correctly? [1 Mark]
print('Probability that none of the three orders will be recognised correctly is %1.4f' % binomial_pmf[0])
print('In percentage, %1.2f' % (binomial_pmf[0] * 100),'%')
Probability that none of the three orders will be recognised correctly is 0.0023 In percentage, 0.23 %
4.C. What is the probability that at least two of the three orders will be recognised correctly? [1 Mark]
print('Probability that at least two of the three orders will be recognised correctly is %1.4f' % (binomial_pmf[2] + binomial_pmf[3]))
print('In percentage, %1.2f' % ((binomial_pmf[2] + binomial_pmf[3]) * 100),'%')
Probability that at least two of the three orders will be recognised correctly is 0.9523 In percentage, 95.23 %
plt.plot(k, binomial_cdf, 'o-')
plt.title('Binomial Distribution')
plt.xlabel('Percentage of orders that are taken correctly')
plt.ylabel('Cumulative Probability of orders recognised correctly')
plt.show()
Examples of Applied Statistics in Real Life
There are a variety of applications used in our daily life that tend to make use of applied statistics and related theories. Some of them are listed below:
Stock Market data analysis Stock market analysis is a classic example of statistical analysis in real life. The investor or the consumer willing to invest in the market tends to take all the available data from the market and perform research and analysis on it with the help of various statistical models to determine the performance portfolio of different investments. This helps the user improve his/her chances of making the most appropriate choice of all the available options. To simplify this process, a variety of software, web pages, and mobile applications have been developed and are available over the internet to educate a person about the working of the stock market and to properly guide him/her throughout the process of making an investment.
Weather Forecasting Weather Forecasting is yet another example of a real-life application that makes use of statistical analysis. Weather forecasting basically depends on predicting the probability of occurrence of a particular event based on a collection of past or historical data. To perform weather forecasting with utmost efficiency, the historical trends related to the weather and climate conditions such as air temperature, pressure value, magnitude of humidity, air quality index, the appearance of clouds, speed and direction of winds, precipitation levels and frequency, etc. are captured in form of sample datasets. The bunch of raw data is then fed to algorithms that perform the necessary computation and analysis to draw out conclusions. Weather forecasting basically falls under the category of inferential statistics.
• DOMAIN: Sports
• CONTEXT: Company X manages the men's top professional basketball division of the American league system. The dataset contains information on all the teams that have participated in all the past tournaments. It has data about how many baskets each team scored, conceded, how many times they came within the first 2 positions, how many tournaments they have qualified, their best position in the past, etc.
• DATA DESCRIPTION: Basketball.csv - The data set contains information on all the teams so far participated in all the past tournaments.
• DATA DICTIONARY:
• PROJECT OBJECTIVE: Company’s management wants to invest on proposals on managing some of the best teams in the league. The analytics department has been assigned with a task of creating a report on the performance shown by the teams. Some of the older teams are already in contract with competitors. Hence Company X wants to understand which teams they can approach which will be a deal win for them.
• STEPS AND TASK [30 Marks]:
# Read the data set
basketball_df = pd.read_csv('Basketball.csv');
basketball_df
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | Runner-up | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Team 1 | 86 | 4385 | 2762 | 1647 | 552 | 563 | 5947 | 3140 | 33 | 23 | 1929 | 1 |
| 1 | Team 2 | 86 | 4262 | 2762 | 1581 | 573 | 608 | 5900 | 3114 | 25 | 25 | 1929 | 1 |
| 2 | Team 3 | 80 | 3442 | 2614 | 1241 | 598 | 775 | 4534 | 3309 | 10 | 8 | 1929 | 1 |
| 3 | Team 4 | 82 | 3386 | 2664 | 1187 | 616 | 861 | 4398 | 3469 | 6 | 6 | 1931to32 | 1 |
| 4 | Team 5 | 86 | 3368 | 2762 | 1209 | 633 | 920 | 4631 | 3700 | 8 | 7 | 1929 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 56 | Team 57 | 1 | 34 | 38 | 8 | 10 | 20 | 38 | 66 | - | - | 2009-10 | 20 |
| 57 | Team 58 | 1 | 22 | 30 | 7 | 8 | 15 | 37 | 57 | - | - | 1956-57 | 16 |
| 58 | Team 59 | 1 | 19 | 30 | 7 | 5 | 18 | 51 | 85 | - | - | 1951~52 | 16 |
| 59 | Team 60 | 1 | 14 | 30 | 5 | 4 | 21 | 34 | 65 | - | - | 1955-56 | 15 |
| 60 | Team 61 | 1 | - | - | - | - | - | - | - | - | - | 2017~18 | 9 |
61 rows × 13 columns
# Shape the data set
basketball_df.shape
(61, 13)
# Information on data and datatypes
basketball_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 61 entries, 0 to 60 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Team 61 non-null object 1 Tournament 61 non-null int64 2 Score 61 non-null object 3 PlayedGames 61 non-null object 4 WonGames 61 non-null object 5 DrawnGames 61 non-null object 6 LostGames 61 non-null object 7 BasketScored 61 non-null object 8 BasketGiven 61 non-null object 9 TournamentChampion 61 non-null object 10 Runner-up 61 non-null object 11 TeamLaunch 61 non-null object 12 HighestPositionHeld 61 non-null int64 dtypes: int64(2), object(11) memory usage: 6.3+ KB
# Clean the data
# Check for null values
basketball_df.isna().sum()
Team 0 Tournament 0 Score 0 PlayedGames 0 WonGames 0 DrawnGames 0 LostGames 0 BasketScored 0 BasketGiven 0 TournamentChampion 0 Runner-up 0 TeamLaunch 0 HighestPositionHeld 0 dtype: int64
No Null values in the data set
# Check for duplicates in data
sum(basketball_df.duplicated())
0
No Duplicate values in the data set
# Replace the Nan values with zero in the dataframe
basketball_df.replace(np.nan, 0, inplace=True)
basketball_df.describe(include = "all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Team | 61 | 61 | Team 1 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Tournament | 61.0 | NaN | NaN | NaN | 24.0 | 26.827225 | 1.0 | 4.0 | 12.0 | 38.0 | 86.0 |
| Score | 61 | 61 | 4385 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| PlayedGames | 61 | 53 | 2762 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| WonGames | 61 | 59 | 7 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DrawnGames | 61 | 57 | 14 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| LostGames | 61 | 56 | 37 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| BasketScored | 61 | 60 | 70 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| BasketGiven | 61 | 61 | 3140 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| TournamentChampion | 61 | 8 | - | 52 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Runner-up | 61 | 10 | - | 48 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| TeamLaunch | 61 | 47 | 1929 | 10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| HighestPositionHeld | 61.0 | NaN | NaN | NaN | 7.081967 | 5.276663 | 1.0 | 3.0 | 6.0 | 10.0 | 20.0 |
basketball_df.tail()
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | Runner-up | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56 | Team 57 | 1 | 34 | 38 | 8 | 10 | 20 | 38 | 66 | - | - | 2009-10 | 20 |
| 57 | Team 58 | 1 | 22 | 30 | 7 | 8 | 15 | 37 | 57 | - | - | 1956-57 | 16 |
| 58 | Team 59 | 1 | 19 | 30 | 7 | 5 | 18 | 51 | 85 | - | - | 1951~52 | 16 |
| 59 | Team 60 | 1 | 14 | 30 | 5 | 4 | 21 | 34 | 65 | - | - | 1955-56 | 15 |
| 60 | Team 61 | 1 | - | - | - | - | - | - | - | - | - | 2017~18 | 9 |
# Check for unique values for all the columns
print("Team", basketball_df['Team'].unique())
print('-----------------------------------------------------------------------------------');
print("Tournament", basketball_df['Tournament'].unique())
print('-----------------------------------------------------------------------------------');
print("Score", basketball_df['Score'].unique())
print('-----------------------------------------------------------------------------------');
print("PlayedGames", basketball_df['PlayedGames'].unique())
print('-----------------------------------------------------------------------------------');
print("WonGames", basketball_df['WonGames'].unique())
print('-----------------------------------------------------------------------------------');
print("DrawnGames", basketball_df['DrawnGames'].unique())
print('-----------------------------------------------------------------------------------');
print("LostGames", basketball_df['LostGames'].unique())
print('-----------------------------------------------------------------------------------');
print("BasketScored", basketball_df['BasketScored'].unique())
print('-----------------------------------------------------------------------------------');
print("BasketGiven", basketball_df['BasketGiven'].unique())
print('-----------------------------------------------------------------------------------');
print("TournamentChampion", basketball_df['TournamentChampion'].unique())
print('-----------------------------------------------------------------------------------');
print("Runner-up", basketball_df['Runner-up'].unique())
print('-----------------------------------------------------------------------------------');
print("TeamLaunch", basketball_df['TeamLaunch'].unique())
print('-----------------------------------------------------------------------------------');
print("HighestPositionHeld", basketball_df['HighestPositionHeld'].unique())
Team ['Team 1' 'Team 2' 'Team 3' 'Team 4' 'Team 5' 'Team 6' 'Team 7' 'Team 8' 'Team 9' 'Team 10' 'Team 11' 'Team 12' 'Team 13' 'Team 14' 'Team 15' 'Team 16' 'Team 17' 'Team 18' 'Team 19' 'Team 20' 'Team 21' 'Team 22' 'Team 23' 'Team 24' 'Team 25' 'Team 26' 'Team 27' 'Team 28' 'Team 29' 'Team 30' 'Team 31' 'Team 32' 'Team 33' 'Team 34' 'Team 35' 'Team 36' 'Team 37' 'Team 38' 'Team 39' 'Team 40' 'Team 41' 'Team 42' 'Team 43' 'Team 44' 'Team 45' 'Team 46' 'Team 47' 'Team 48' 'Team 49' 'Team 50' 'Team 51' 'Team 52' 'Team 53' 'Team 54' 'Team 55' 'Team 56' 'Team 57' 'Team 58' 'Team 59' 'Team 60' 'Team 61'] ----------------------------------------------------------------------------------- Tournament [86 80 82 73 70 58 51 45 42 44 43 37 36 38 27 33 17 23 21 12 20 13 18 11 14 9 7 6 4 5 3 2 1] ----------------------------------------------------------------------------------- Score ['4385' '4262' '3442' '3386' '3368' '2819' '2792' '2573' '2109' '1884' '1814' '1789' '1471' '1416' '1389' '1351' '1314' '1174' '1148' '1020' '970' '667' '662' '606' '553' '538' '510' '445' '421' '416' '375' '353' '343' '293' '285' '277' '242' '230' '190' '188' '168' '150' '148' '132' '107' '96' '91' '83' '81' '76' '71' '56' '52' '42' '40' '35' '34' '22' '19' '14' '-'] ----------------------------------------------------------------------------------- PlayedGames ['2762' '2614' '2664' '2408' '2626' '2302' '1986' '1728' '1530' '1698' '1466' '1428' '1458' '1318' '1255' '1192' '988' '1096' '646' '742' '652' '678' '456' '628' '494' '586' '380' '402' '423' '426' '448' '346' '334' '270' '228' '282' '160' '186' '204' '180' '152' '114' '130' '116' '80' '108' '90' '72' '68' '54' '38' '30' '-'] ----------------------------------------------------------------------------------- WonGames ['1647' '1581' '1241' '1187' '1209' '990' '948' '864' '698' '606' '563' '586' '463' '453' '471' '426' '390' '408' '333' '367' '266' '218' '189' '203' '147' '184' '155' '145' '125' '113' '123' '129' '104' '96' '103' '76' '62' '82' '52' '50' '59' '53' '37' '35' '43' '26' '34' '20' '19' '30' '29' '21' '17' '18' '13' '8' '7' '5' '-'] ----------------------------------------------------------------------------------- DrawnGames ['552' '573' '598' '616' '633' '531' '608' '577' '522' '440' '392' '389' '384' '336' '358' '327' '330' '292' '256' '242' '172' '175' '148' '180' '112' '149' '128' '143' '81' '95' '102' '127' '92' '79' '76' '56' '63' '45' '46' '50' '44' '37' '27' '21' '16' '23' '24' '13' '14' '18' '6' '11' '10' '8' '5' '4' '-'] ----------------------------------------------------------------------------------- LostGames ['563' '608' '775' '861' '920' '887' '1070' '766' '682' '575' '723' '619' '639' '629' '565' '535' '492' '399' '487' '208' '349' '305' '295' '197' '211' '298' '174' '194' '198' '202' '217' '158' '152' '118' '110' '137' '63' '90' '95' '83' '78' '52' '66' '44' '37' '62' '48' '33' '30' '41' '19' '20' '15' '18' '21' '-'] ----------------------------------------------------------------------------------- BasketScored ['5947' '5900' '4534' '4398' '4631' '3680' '3609' '3228' '2683' '2159' '2052' '2278' '1767' '1843' '1753' '1500' '1421' '1642' '1182' '1347' '892' '819' '760' '750' '520' '716' '619' '607' '458' '430' '422' '492' '393' '291' '419' '320' '244' '285' '199' '202' '216' '165' '155' '139' '227' '101' '181' '62' '70' '145' '121' '153' '71' '97' '36' '38' '37' '51' '34' '-'] ----------------------------------------------------------------------------------- BasketGiven ['3140' '3114' '3309' '3469' '3700' '3373' '3889' '3230' '2847' '2492' '2188' '2624' '2180' '2368' '2152' '1834' '1763' '1951' '1371' '1746' '789' '1157' '1088' '1022' '633' '1050' '744' '992' '623' '632' '581' '720' '662' '489' '588' '410' '366' '430' '241' '296' '310' '221' '253' '167' '308' '139' '295' '117' '115' '252' '183' '184' '116' '131' '182' '55' '66' '57' '85' '65' '-'] ----------------------------------------------------------------------------------- TournamentChampion ['33' '25' '10' '6' '8' '1' '-' '2'] ----------------------------------------------------------------------------------- Runner-up ['23' '25' '8' '6' '7' '4' '-' '3' '1' '5'] ----------------------------------------------------------------------------------- TeamLaunch ['1929' '1931to32' '1934-35' '1939-40' '1932-33' '1941to42' '1948-49' '1944_45' '1935-36' '1949_50' '1933to34' '1960-61' '1951-52' '1998-99' '1941-42' '1977-78' '1959-60' '2004to05' '1961-62' '1940-41' '1930-31' '1963-64' '1974-75' '1943-44' '1987-88' '1991_92' '2007-08' '1962-63' '1994-95' '1978-79' '1971-72' '1999to00' '2014-15' '1990-91' '1947-48' '1996-97' '1995-96' '1945-46' '1953-54' '1979-80' '1950-51' '2016_17' '2009-10' '1956-57' '1951~52' '1955-56' '2017~18'] ----------------------------------------------------------------------------------- HighestPositionHeld [ 1 3 2 4 6 8 5 11 7 12 10 17 9 19 14 16 20 15]
There are special character '-' in multiple columns which needs to be eliminated as this is data missing.
Column name 'Runner-up' contains special character '-' which needs to be renamed.
Column name 'TeamLaunch' contains special character '~, _' and also 'to' which needs to be replaced.
Column name 'Team' data set value contains space which needs to be removed and replace the value.
Change datatype of columns except column 'Team' to int64 datatype
# Take a copy of dataframe before making any changes to the value or column name
basketball_df_copy = basketball_df.copy(deep=True)
basketball_df_copy
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | Runner-up | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Team 1 | 86 | 4385 | 2762 | 1647 | 552 | 563 | 5947 | 3140 | 33 | 23 | 1929 | 1 |
| 1 | Team 2 | 86 | 4262 | 2762 | 1581 | 573 | 608 | 5900 | 3114 | 25 | 25 | 1929 | 1 |
| 2 | Team 3 | 80 | 3442 | 2614 | 1241 | 598 | 775 | 4534 | 3309 | 10 | 8 | 1929 | 1 |
| 3 | Team 4 | 82 | 3386 | 2664 | 1187 | 616 | 861 | 4398 | 3469 | 6 | 6 | 1931to32 | 1 |
| 4 | Team 5 | 86 | 3368 | 2762 | 1209 | 633 | 920 | 4631 | 3700 | 8 | 7 | 1929 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 56 | Team 57 | 1 | 34 | 38 | 8 | 10 | 20 | 38 | 66 | - | - | 2009-10 | 20 |
| 57 | Team 58 | 1 | 22 | 30 | 7 | 8 | 15 | 37 | 57 | - | - | 1956-57 | 16 |
| 58 | Team 59 | 1 | 19 | 30 | 7 | 5 | 18 | 51 | 85 | - | - | 1951~52 | 16 |
| 59 | Team 60 | 1 | 14 | 30 | 5 | 4 | 21 | 34 | 65 | - | - | 1955-56 | 15 |
| 60 | Team 61 | 1 | - | - | - | - | - | - | - | - | - | 2017~18 | 9 |
61 rows × 13 columns
# Rename column name as the column name contains special character '-'
basketball_df.rename(columns = {'Runner-up': 'RunnerUp'}, inplace = True)
basketball_df.columns
Index(['Team', 'Tournament', 'Score', 'PlayedGames', 'WonGames', 'DrawnGames',
'LostGames', 'BasketScored', 'BasketGiven', 'TournamentChampion',
'RunnerUp', 'TeamLaunch', 'HighestPositionHeld'],
dtype='object')
# Take first 4 values from column 'Teamlaunch' and save the sliced value in same column 'TeamLaunch'
basketball_df['TeamLaunch'] = basketball_df['TeamLaunch'].str.slice(0, 4)
basketball_df['TeamLaunch']
0 1929
1 1929
2 1929
3 1931
4 1929
...
56 2009
57 1956
58 1951
59 1955
60 2017
Name: TeamLaunch, Length: 61, dtype: object
# Replace special characters to 0 in all rows
basketball_df = basketball_df.replace('-', 0)
basketball_df
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | RunnerUp | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Team 1 | 86 | 4385 | 2762 | 1647 | 552 | 563 | 5947 | 3140 | 33 | 23 | 1929 | 1 |
| 1 | Team 2 | 86 | 4262 | 2762 | 1581 | 573 | 608 | 5900 | 3114 | 25 | 25 | 1929 | 1 |
| 2 | Team 3 | 80 | 3442 | 2614 | 1241 | 598 | 775 | 4534 | 3309 | 10 | 8 | 1929 | 1 |
| 3 | Team 4 | 82 | 3386 | 2664 | 1187 | 616 | 861 | 4398 | 3469 | 6 | 6 | 1931 | 1 |
| 4 | Team 5 | 86 | 3368 | 2762 | 1209 | 633 | 920 | 4631 | 3700 | 8 | 7 | 1929 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 56 | Team 57 | 1 | 34 | 38 | 8 | 10 | 20 | 38 | 66 | 0 | 0 | 2009 | 20 |
| 57 | Team 58 | 1 | 22 | 30 | 7 | 8 | 15 | 37 | 57 | 0 | 0 | 1956 | 16 |
| 58 | Team 59 | 1 | 19 | 30 | 7 | 5 | 18 | 51 | 85 | 0 | 0 | 1951 | 16 |
| 59 | Team 60 | 1 | 14 | 30 | 5 | 4 | 21 | 34 | 65 | 0 | 0 | 1955 | 15 |
| 60 | Team 61 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2017 | 9 |
61 rows × 13 columns
# Remove the space in column value 'Team'
basketball_df['Team'] = basketball_df['Team'].str.replace('Team ', 'Team');
basketball_df['Team']
0 Team1
1 Team2
2 Team3
3 Team4
4 Team5
...
56 Team57
57 Team58
58 Team59
59 Team60
60 Team61
Name: Team, Length: 61, dtype: object
# Change datatype to int64 for below mentioned columns
columns = ['Tournament', 'Score', 'PlayedGames', 'WonGames', 'DrawnGames',
'LostGames', 'BasketScored', 'BasketGiven', 'TournamentChampion',
'RunnerUp', 'HighestPositionHeld']
basketball_df[columns] = basketball_df[columns].apply(pd.to_numeric, errors='coerce')
basketball_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 61 entries, 0 to 60 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Team 61 non-null object 1 Tournament 61 non-null int64 2 Score 61 non-null int64 3 PlayedGames 61 non-null int64 4 WonGames 61 non-null int64 5 DrawnGames 61 non-null int64 6 LostGames 61 non-null int64 7 BasketScored 61 non-null int64 8 BasketGiven 61 non-null int64 9 TournamentChampion 61 non-null int64 10 RunnerUp 61 non-null int64 11 TeamLaunch 61 non-null object 12 HighestPositionHeld 61 non-null int64 dtypes: int64(11), object(2) memory usage: 6.3+ KB
# Data cleaning is completed. Data set will look like below.
basketball_df.tail()
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | RunnerUp | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56 | Team57 | 1 | 34 | 38 | 8 | 10 | 20 | 38 | 66 | 0 | 0 | 2009 | 20 |
| 57 | Team58 | 1 | 22 | 30 | 7 | 8 | 15 | 37 | 57 | 0 | 0 | 1956 | 16 |
| 58 | Team59 | 1 | 19 | 30 | 7 | 5 | 18 | 51 | 85 | 0 | 0 | 1951 | 16 |
| 59 | Team60 | 1 | 14 | 30 | 5 | 4 | 21 | 34 | 65 | 0 | 0 | 1955 | 15 |
| 60 | Team61 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2017 | 9 |
# Creating new column WonGamesPercentage.
basketball_df['WonGamesPercentage'] = basketball_df['WonGames']/basketball_df['PlayedGames']
basketball_df['WonGamesPercentage']
0 0.596307
1 0.572411
2 0.474751
3 0.445571
4 0.437726
...
56 0.210526
57 0.233333
58 0.233333
59 0.166667
60 NaN
Name: WonGamesPercentage, Length: 61, dtype: float64
# Creating new column DrawnGamesPercentage.
basketball_df['DrawnGamesPercentage'] = round(basketball_df['DrawnGames'] / basketball_df['PlayedGames'], 2)
basketball_df['DrawnGamesPercentage']
0 0.20
1 0.21
2 0.23
3 0.23
4 0.23
...
56 0.26
57 0.27
58 0.17
59 0.13
60 NaN
Name: DrawnGamesPercentage, Length: 61, dtype: float64
# Creating new column LostGamesPercentage.
basketball_df['LostGamesPercentage'] = round(basketball_df['LostGames'] / basketball_df['PlayedGames'], 2)
basketball_df['LostGamesPercentage']
0 0.20
1 0.22
2 0.30
3 0.32
4 0.33
...
56 0.53
57 0.50
58 0.60
59 0.70
60 NaN
Name: LostGamesPercentage, Length: 61, dtype: float64
# Replace the Nan values with zero in the dataframe
basketball_df.replace(np.nan, 0, inplace=True)
basketball_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 61 entries, 0 to 60 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Team 61 non-null object 1 Tournament 61 non-null int64 2 Score 61 non-null int64 3 PlayedGames 61 non-null int64 4 WonGames 61 non-null int64 5 DrawnGames 61 non-null int64 6 LostGames 61 non-null int64 7 BasketScored 61 non-null int64 8 BasketGiven 61 non-null int64 9 TournamentChampion 61 non-null int64 10 RunnerUp 61 non-null int64 11 TeamLaunch 61 non-null object 12 HighestPositionHeld 61 non-null int64 13 WonGamesPercentage 61 non-null float64 14 DrawnGamesPercentage 61 non-null float64 15 LostGamesPercentage 61 non-null float64 dtypes: float64(3), int64(11), object(2) memory usage: 7.8+ KB
basketball_df.tail()
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | RunnerUp | TeamLaunch | HighestPositionHeld | WonGamesPercentage | DrawnGamesPercentage | LostGamesPercentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56 | Team57 | 1 | 34 | 38 | 8 | 10 | 20 | 38 | 66 | 0 | 0 | 2009 | 20 | 0.210526 | 0.26 | 0.53 |
| 57 | Team58 | 1 | 22 | 30 | 7 | 8 | 15 | 37 | 57 | 0 | 0 | 1956 | 16 | 0.233333 | 0.27 | 0.50 |
| 58 | Team59 | 1 | 19 | 30 | 7 | 5 | 18 | 51 | 85 | 0 | 0 | 1951 | 16 | 0.233333 | 0.17 | 0.60 |
| 59 | Team60 | 1 | 14 | 30 | 5 | 4 | 21 | 34 | 65 | 0 | 0 | 1955 | 15 | 0.166667 | 0.13 | 0.70 |
| 60 | Team61 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2017 | 9 | 0.000000 | 0.00 | 0.00 |
Hint: Use statistical techniques and visualisation techniques to come up with useful metrics and reporting. Find out the best performing team, oldest team, team with highest goals, team with lowest performance etc. and many more. These are just random examples. Please use your best analytical approach to build this report. You can mix match columns to create new ones which can be used for better analysis. Create your own features if required. Be highly experimental and analytical here to find hidden patterns. Use graphical interactive libraries to enable you to publish interactive plots in python.
basketball_df.describe(include = "all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Team | 61 | 61 | Team1 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Tournament | 61.0 | NaN | NaN | NaN | 24.0 | 26.827225 | 1.0 | 4.0 | 12.0 | 38.0 | 86.0 |
| Score | 61.0 | NaN | NaN | NaN | 901.42623 | 1134.899121 | 0.0 | 96.0 | 375.0 | 1351.0 | 4385.0 |
| PlayedGames | 61.0 | NaN | NaN | NaN | 796.819672 | 876.282765 | 0.0 | 114.0 | 423.0 | 1318.0 | 2762.0 |
| WonGames | 61.0 | NaN | NaN | NaN | 303.967213 | 406.99103 | 0.0 | 34.0 | 123.0 | 426.0 | 1647.0 |
| DrawnGames | 61.0 | NaN | NaN | NaN | 188.934426 | 201.799477 | 0.0 | 24.0 | 95.0 | 330.0 | 633.0 |
| LostGames | 61.0 | NaN | NaN | NaN | 303.754098 | 294.708594 | 0.0 | 62.0 | 197.0 | 563.0 | 1070.0 |
| BasketScored | 61.0 | NaN | NaN | NaN | 1140.344262 | 1506.740211 | 0.0 | 153.0 | 430.0 | 1642.0 | 5947.0 |
| BasketGiven | 61.0 | NaN | NaN | NaN | 1140.229508 | 1163.710766 | 0.0 | 221.0 | 632.0 | 1951.0 | 3889.0 |
| TournamentChampion | 61.0 | NaN | NaN | NaN | 1.42623 | 5.472535 | 0.0 | 0.0 | 0.0 | 0.0 | 33.0 |
| RunnerUp | 61.0 | NaN | NaN | NaN | 1.409836 | 4.540107 | 0.0 | 0.0 | 0.0 | 0.0 | 25.0 |
| TeamLaunch | 61 | 45 | 1929 | 10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| HighestPositionHeld | 61.0 | NaN | NaN | NaN | 7.081967 | 5.276663 | 1.0 | 3.0 | 6.0 | 10.0 | 20.0 |
| WonGamesPercentage | 61.0 | NaN | NaN | NaN | 0.308506 | 0.087426 | 0.0 | 0.27193 | 0.302817 | 0.334854 | 0.596307 |
| DrawnGamesPercentage | 61.0 | NaN | NaN | NaN | 0.232623 | 0.054034 | 0.0 | 0.22 | 0.24 | 0.26 | 0.39 |
| LostGamesPercentage | 61.0 | NaN | NaN | NaN | 0.442623 | 0.101422 | 0.0 | 0.41 | 0.46 | 0.49 | 0.7 |
basketball_df.skew()
C:\Users\Bhavya Govindrao\AppData\Local\Temp\ipykernel_8420\1534467484.py:1: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. basketball_df.skew()
Tournament 1.217038 Score 1.593109 PlayedGames 1.141978 WonGames 1.805728 DrawnGames 1.004159 LostGames 0.897130 BasketScored 1.777436 BasketGiven 0.975859 TournamentChampion 4.777021 RunnerUp 4.360643 TeamLaunch 0.672956 HighestPositionHeld 0.817976 WonGamesPercentage 0.404140 DrawnGamesPercentage -1.434088 LostGamesPercentage -1.448581 dtype: float64
# Univariate analysis Numerical columns - Distribution Plots
rows = 3
cols = 5
iterator = 1
plt.figure(figsize=(10, 5))
columns = basketball_df.select_dtypes(include = np.number).columns
for i, column in enumerate(basketball_df[columns], iterator):
plt.subplot(rows, cols, i)
sns.histplot(basketball_df[column], kde = True)
plt.title(column)
plt.xticks(rotation = 90)
plt.tight_layout()
plt.show()
Observations :
1. 'Tournament', 'Score', 'PlayedGames', 'WonGames', 'DrawnGames',
'LostGames', 'BasketScored', 'BasketGiven', 'TournamentChampion',
'RunnerUp', 'TeamLaunch', 'HighestPositionHeld' are Right skewed.
2. 'WonGames', 'DrawnGames', 'LostGames' are left skewed.
# Univariate analysis Numerical columns - Box Plots
rows = 3
cols = 5
iterator = 1
plt.figure(figsize=(10, 5))
columns = basketball_df.select_dtypes(include = np.number).columns
for i, column in enumerate(basketball_df[columns], iterator):
plt.subplot(rows, cols, i)
sns.boxplot(basketball_df[column])
plt.title(column)
plt.xticks(rotation = 90)
plt.tight_layout()
plt.show()
Observations :
1. 'Tournament', 'Score', 'PlayedGames', 'WonGames', 'DrawnGames',
'LostGames', 'BasketScored', 'BasketGiven', 'TournamentChampion',
'RunnerUp', 'TeamLaunch', 'HighestPositionHeld' are Right skewed.
2. 'WonGames', 'DrawnGames', 'LostGames' are left skewed.
sns.lineplot(x = 'Team', y = 'PlayedGames', data=basketball_df.head(25), color='blue')
sns.lineplot(x = 'Team', y = 'WonGames', data=basketball_df.head(25), color='red')
sns.lineplot(x = 'Team', y = 'DrawnGames', data=basketball_df.head(25), color='green')
sns.lineplot(x = 'Team', y = 'LostGames', data=basketball_df.head(25), color='orange')
sns.lineplot(x = 'Team', y = 'BasketScored', data=basketball_df.head(25), color='black')
sns.lineplot(x = 'Team', y = 'BasketGiven', data=basketball_df.head(25), color='purple')
sns.lineplot(x = 'Team', y = 'RunnerUp', data=basketball_df.head(25), color='pink')
sns.lineplot(x = 'Team', y = 'HighestPositionHeld', data=basketball_df.head(25), color='grey')
sns.lineplot(x = 'Team', y = 'WonGamesPercentage', data=basketball_df.head(25), color='chocolate')
sns.lineplot(x = 'Team', y = 'DrawnGamesPercentage', data=basketball_df.head(25), color='yellow')
sns.lineplot(x = 'Team', y = 'LostGamesPercentage', data=basketball_df.head(25), color='cyan')
sns.lineplot(x = 'Team', y = 'Score', data=basketball_df.head(25), color='brown')
plt.grid()
plt.xticks(rotation=90)
plt.legend(labels=['PlayedGames', 'WonGames', 'DrawnGames', 'LostGames','BasketScored','BasketGiven','RunnerUp','HighestPositionHeld','WonGamesPercentage','DrawnGamesPercentage','LostGamesPercentage','Score'])
plt.show()
# Univariate analysis Categorical columns - Count/Bar Plots
rows = 2
cols = 1
iterator = 1
plt.figure(figsize=(10,5))
columns = basketball_df.select_dtypes(include = 'object').columns
for i, column in enumerate(basketball_df[columns], iterator):
plt.subplot(rows, cols, i)
sns.countplot(data = basketball_df, x = column)
plt.title(column)
plt.xticks(rotation = 90)
plt.tight_layout()
plt.show()
Observations :
1. There are total 61 teams.
2. In 1929, there are total of 10 teams were launched.
3. In 1941, there are total of 3 teams were launched.
4. In each year minimum of 1 team launched the team.
# Bivariate analysis Numerical columns vs Categorical columns (Team) - Bar Plots
px.bar(data_frame=basketball_df, x='Team', y='Tournament', color='Team')
Observations :
1. There are total 61 teams.
2. Team1, Team2, Team5 won 86 Tournment.
3. Tournament data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='Score', color='Team')
Observations :
1. There are total 61 teams.
2. Team1 scored 4385 highest, Team2 scored 4262 as second highest score.
3. Score data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='PlayedGames', color='Team')
Observations :
1. There are total 61 teams.
2. Team1, Team2, Team3 played 2762 games.
3. PlayedGames data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='WonGames', color='Team')
Observations :
1. There are total 61 teams.
2. Team1 won 1647 highest, Team2 won 1581 as second highest.
3. WonGames data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='DrawnGames', color='Team')
Observations :
1. There are total 61 teams.
2. Team5 drawn 633 highest, Team4 drawn 612 as second highest.
3. DrawnGames data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='LostGames', color='Team')
Observations :
1. There are total 61 teams.
2. Team7 lost 1070 highest, Team5 lost 920 as second highest.
3. LostGames data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='BasketScored', color='Team')
Observations :
1. There are total 61 teams.
2. Team1 scored 5947 highest, Team2 scored 5900 as second highest.
3. BasketScored data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='BasketGiven', color='Team')
Observations :
1. There are total 61 teams.
2. Team7 given 3889 highest, Team5 given 3700 as second highest.
3. BasketGiven data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='TournamentChampion', color='Team')
Observations :
1. There are total 61 teams.
2. Team1 tournamentchampion 33 highest, Team2 tournamentchampion 25 as second highest.
3. TournamentChampion data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='RunnerUp', color='Team')
Observations :
1. There are total 61 teams.
2. Team2 runnerup 25 highest, Team1 runnerup 23 as second highest.
3. RunnerUp data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='HighestPositionHeld', color='Team')
Observations :
1. There are total 61 teams.
2. Team57 highestpositionheld 33 highest, Team49 highestpositionheld 25 as second highest.
3. HighestPositionHeld data looks left skewed.
px.bar(data_frame=basketball_df, x='Team', y='WonGamesPercentage', color='Team')
Observations :
1. There are total 61 teams.
2. Team1 won games %age 0.59% highest, Team2 won games %age 0.57% as second highest.
3. WonGamesPercentage data looks right skewed.
px.bar(data_frame=basketball_df, x='Team', y='DrawnGamesPercentage', color='Team')
Observations :
1. There are total 61 teams.
2. Team46 drawn games %age 0.39% highest, Team49 won games %age 0.30% as second highest.
3. DrawnGamesPercentage data looks left skewed.
px.bar(data_frame=basketball_df, x='Team', y='LostGamesPercentage', color='Team')
Observations :
1. There are total 61 teams.
2. Team60 lost games %age 0.70% highest, Team55 and Team59 lost games %age 0.60% as second highest.
3. LostGamesPercentage data looks left skewed.
basketball_df.sort_values(by = ['WonGamesPercentage'], ascending=False).head(10)
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | RunnerUp | TeamLaunch | HighestPositionHeld | WonGamesPercentage | DrawnGamesPercentage | LostGamesPercentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Team1 | 86 | 4385 | 2762 | 1647 | 552 | 563 | 5947 | 3140 | 33 | 23 | 1929 | 1 | 0.596307 | 0.20 | 0.20 |
| 1 | Team2 | 86 | 4262 | 2762 | 1581 | 573 | 608 | 5900 | 3114 | 25 | 25 | 1929 | 1 | 0.572411 | 0.21 | 0.22 |
| 2 | Team3 | 80 | 3442 | 2614 | 1241 | 598 | 775 | 4534 | 3309 | 10 | 8 | 1929 | 1 | 0.474751 | 0.23 | 0.30 |
| 3 | Team4 | 82 | 3386 | 2664 | 1187 | 616 | 861 | 4398 | 3469 | 6 | 6 | 1931 | 1 | 0.445571 | 0.23 | 0.32 |
| 4 | Team5 | 86 | 3368 | 2762 | 1209 | 633 | 920 | 4631 | 3700 | 8 | 7 | 1929 | 1 | 0.437726 | 0.23 | 0.33 |
| 20 | Team21 | 17 | 970 | 646 | 266 | 172 | 208 | 892 | 789 | 0 | 1 | 1998 | 2 | 0.411765 | 0.27 | 0.32 |
| 5 | Team6 | 73 | 2819 | 2408 | 990 | 531 | 887 | 3680 | 3373 | 1 | 4 | 1934 | 1 | 0.411130 | 0.22 | 0.37 |
| 7 | Team8 | 70 | 2573 | 2302 | 864 | 577 | 861 | 3228 | 3230 | 2 | 3 | 1929 | 1 | 0.375326 | 0.25 | 0.37 |
| 10 | Team11 | 45 | 1814 | 1530 | 563 | 392 | 575 | 2052 | 2188 | 1 | 5 | 1941 | 1 | 0.367974 | 0.26 | 0.38 |
| 6 | Team7 | 82 | 2792 | 2626 | 948 | 608 | 1070 | 3609 | 3889 | 0 | 0 | 1929 | 3 | 0.361005 | 0.23 | 0.41 |
Observations :
1. There are total 61 teams.
2. Team1 won games %age 0.59% highest, Team2 won games %age 0.57% as second highest.
3. WonGamesPercentage data looks right skewed.
basketball_df.sort_values(by = ['DrawnGamesPercentage'], ascending=False).head(10)
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | RunnerUp | TeamLaunch | HighestPositionHeld | WonGamesPercentage | DrawnGamesPercentage | LostGamesPercentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 45 | Team46 | 3 | 96 | 114 | 26 | 44 | 44 | 101 | 139 | 0 | 0 | 1990 | 9 | 0.228070 | 0.39 | 0.39 |
| 48 | Team49 | 2 | 81 | 80 | 19 | 24 | 37 | 70 | 115 | 0 | 0 | 1995 | 19 | 0.237500 | 0.30 | 0.46 |
| 55 | Team56 | 1 | 35 | 38 | 8 | 11 | 19 | 36 | 55 | 0 | 0 | 2016 | 17 | 0.210526 | 0.29 | 0.50 |
| 47 | Team48 | 2 | 83 | 80 | 20 | 23 | 37 | 62 | 117 | 0 | 0 | 1996 | 17 | 0.250000 | 0.29 | 0.46 |
| 32 | Team33 | 12 | 343 | 448 | 104 | 127 | 217 | 393 | 662 | 0 | 0 | 1977 | 12 | 0.232143 | 0.28 | 0.48 |
| 35 | Team36 | 7 | 277 | 270 | 76 | 76 | 118 | 320 | 410 | 0 | 0 | 1991 | 7 | 0.281481 | 0.28 | 0.44 |
| 38 | Team39 | 4 | 190 | 160 | 52 | 45 | 63 | 199 | 241 | 0 | 0 | 1994 | 10 | 0.325000 | 0.28 | 0.39 |
| 33 | Team34 | 9 | 293 | 346 | 96 | 92 | 158 | 291 | 489 | 0 | 0 | 1987 | 7 | 0.277457 | 0.27 | 0.46 |
| 23 | Team24 | 21 | 606 | 678 | 203 | 180 | 295 | 750 | 1022 | 0 | 0 | 1959 | 5 | 0.299410 | 0.27 | 0.44 |
| 57 | Team58 | 1 | 22 | 30 | 7 | 8 | 15 | 37 | 57 | 0 | 0 | 1956 | 16 | 0.233333 | 0.27 | 0.50 |
Observations :
1. There are total 61 teams.
2. Team46 drawn games %age 0.39% highest, Team49 won games %age 0.30% as second highest.
3. DrawnGamesPercentage data looks left skewed.
basketball_df.sort_values(by = ['LostGamesPercentage'], ascending=False).head(10)
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | RunnerUp | TeamLaunch | HighestPositionHeld | WonGamesPercentage | DrawnGamesPercentage | LostGamesPercentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 59 | Team60 | 1 | 14 | 30 | 5 | 4 | 21 | 34 | 65 | 0 | 0 | 1955 | 15 | 0.166667 | 0.13 | 0.70 |
| 54 | Team55 | 2 | 40 | 68 | 13 | 14 | 41 | 70 | 182 | 0 | 0 | 1950 | 16 | 0.191176 | 0.21 | 0.60 |
| 58 | Team59 | 1 | 19 | 30 | 7 | 5 | 18 | 51 | 85 | 0 | 0 | 1951 | 16 | 0.233333 | 0.17 | 0.60 |
| 49 | Team50 | 4 | 76 | 108 | 30 | 16 | 62 | 145 | 252 | 0 | 0 | 1945 | 10 | 0.277778 | 0.15 | 0.57 |
| 46 | Team47 | 4 | 91 | 116 | 34 | 16 | 66 | 181 | 295 | 0 | 0 | 1947 | 7 | 0.293103 | 0.14 | 0.57 |
| 53 | Team54 | 3 | 42 | 54 | 18 | 6 | 30 | 97 | 131 | 0 | 0 | 1929 | 8 | 0.333333 | 0.11 | 0.56 |
| 56 | Team57 | 1 | 34 | 38 | 8 | 10 | 20 | 38 | 66 | 0 | 0 | 2009 | 20 | 0.210526 | 0.26 | 0.53 |
| 50 | Team51 | 3 | 71 | 90 | 29 | 13 | 48 | 121 | 183 | 0 | 0 | 1953 | 14 | 0.322222 | 0.14 | 0.53 |
| 51 | Team52 | 4 | 56 | 72 | 21 | 14 | 37 | 153 | 184 | 0 | 0 | 1929 | 6 | 0.291667 | 0.19 | 0.51 |
| 27 | Team28 | 18 | 445 | 586 | 145 | 143 | 298 | 607 | 992 | 0 | 0 | 1940 | 11 | 0.247440 | 0.24 | 0.51 |
Observations :
1. There are total 61 teams.
2. Team60 lost games %age 0.70% highest, Team55 and Team59 lost games %age 0.60% as second highest.
3. LostGamesPercentage data looks left skewed.
# Bivariate analysis Categorical columns vs Categorical columns - Crosstab Plots
pd.crosstab(basketball_df['Team'], basketball_df['TeamLaunch'])
| TeamLaunch | 1929 | 1930 | 1931 | 1932 | 1933 | 1934 | 1935 | 1939 | 1940 | 1941 | ... | 1995 | 1996 | 1998 | 1999 | 2004 | 2007 | 2009 | 2014 | 2016 | 2017 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Team | |||||||||||||||||||||
| Team1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Team10 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Team11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Team12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Team13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Team60 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Team61 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| Team7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Team8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Team9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
61 rows × 45 columns
Observations :
1. Bivariate anaylsis plots the important categorical metrics.
2. Plotting 2 categorical variables, Team and TeamLaunch.
# Multivariate analysis Numerical columns - Heatmap Plots
sns.heatmap(basketball_df.corr(), annot=True, cmap='autumn')
plt.show()
C:\Users\Bhavya Govindrao\AppData\Local\Temp\ipykernel_8420\1483333238.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
Quality: For some columns like Team61, we have incomplete information where none of the records available except TeamLaunch and HighestPositionHeld columns. As this information is not enough to apply any strategies.
Quantity: Since our dataset had only 61 entries, we couldn't figure out more trends. We can include information about the players of each team, at least of the captains. It would have given us some information about the team's performance under different captains.
Variety: To analyze a bastball team other then the given information , more subject related data can be added . Example: HighestPositionHeld column . Dont have target value to analyze team.
Velocity: efficient speed of data collection can be achieved if we maintain a regular record of the games, keep adding data after every match.
Veracity: Our data set is small which contains onlt 61 teams, it has out liers and missing information. With these, Eventually it effects the reliability of its results.
Data set has outdated information check as we ahev few old teams information like Team58, Team59, Team60 which team launched on 1950's and played less games.
Teamwise data prediction may vary because it will be based on players. So better we gather more information about teams and its players.
We can add more identifiers to the data about team. More the identifiers will help us to analyze teams accurately.
• DOMAIN: Startup ecosystem
• CONTEXT: Company X is a EU online publisher focusing on the startups industry. The company specifically reports on the business related to technology news, analysis of emerging trends and profiling of new tech businesses and products. Their event i.e. Startup Battlefield is the world’s pre-eminent startup competition. Startup Battlefield features 15-30 top early stage startups pitching top judges in front of a vast live audience, present in person and online.
• DATA DESCRIPTION: CompanyX_EU.csv - Each row in the dataset is a Start-up company and the columns describe the company.
• DATA DICTIONARY:
1. Startup: Name of the company
2. Product: Actual product
3. Funding: Funds raised by the company in USD
4. Event: The event the company participated in
5. Result: Described by Contestant, Finalist, Audience choice, Winner or Runner up
6. OperatingState: Current status of the company, Operating ,Closed, Acquired or IPO
*Dataset has been downloaded from the internet. All the credit for the dataset goes to the original creator of the data.
• PROJECT OBJECTIVE: Analyse the data of the various companies from the given dataset and perform the tasks that are specified in the below steps. Draw insights from the various attributes that are present in the dataset, plot distributions, state hypotheses and draw conclusions from the dataset.
• STEPS AND TASK [15 Marks]:
# Read the data set
company_df = pd.read_csv('CompanyX_EU.csv');
company_df
| Startup | Product | Funding | Event | Result | OperatingState | |
|---|---|---|---|---|---|---|
| 0 | 2600Hz | 2600hz.com | NaN | Disrupt SF 2013 | Contestant | Operating |
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed |
| 2 | 3DPrinterOS | 3dprinteros.com | NaN | Disrupt SF 2016 | Contestant | Operating |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating |
| 4 | 42 Technologies | 42technologies.com | NaN | Disrupt NYC 2013 | Contestant | Operating |
| ... | ... | ... | ... | ... | ... | ... |
| 657 | Zivity | zivity.com | $8M | TC40 2007 | Contestant | Operating |
| 658 | Zmorph | zmorph3d.com | $1M | - | Audience choice | Operating |
| 659 | Zocdoc | zocdoc.com | $223M | TC40 2007 | Contestant | Operating |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating |
662 rows × 6 columns
A. Check the datatypes of each attribute.
company_df.dtypes
Startup object Product object Funding object Event object Result object OperatingState object dtype: object
B. Check for null values in the attributes.
company_df.isnull()
| Startup | Product | Funding | Event | Result | OperatingState | |
|---|---|---|---|---|---|---|
| 0 | False | False | True | False | False | False |
| 1 | False | False | False | False | False | False |
| 2 | False | False | True | False | False | False |
| 3 | False | False | False | False | False | False |
| 4 | False | False | True | False | False | False |
| ... | ... | ... | ... | ... | ... | ... |
| 657 | False | False | False | False | False | False |
| 658 | False | False | False | False | False | False |
| 659 | False | False | False | False | False | False |
| 660 | False | False | False | False | False | False |
| 661 | False | False | False | False | False | False |
662 rows × 6 columns
company_df.isna().sum()
Startup 0 Product 6 Funding 214 Event 0 Result 0 OperatingState 0 dtype: int64
A. Drop the null values. [1 Mark]
company_df.dropna(inplace = True)
company_df
| Startup | Product | Funding | Event | Result | OperatingState | |
|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating |
| 5 | 5to1 | 5to1.com | $19.3M | TC50 2009 | Contestant | Acquired |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating |
| 10 | AdhereTech | adheretech.com | $1.8M | Hardware Battlefield 2014 | Contestant | Operating |
| ... | ... | ... | ... | ... | ... | ... |
| 657 | Zivity | zivity.com | $8M | TC40 2007 | Contestant | Operating |
| 658 | Zmorph | zmorph3d.com | $1M | - | Audience choice | Operating |
| 659 | Zocdoc | zocdoc.com | $223M | TC40 2007 | Contestant | Operating |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating |
446 rows × 6 columns
Observations :
B. Convert the ‘Funding’ features to a numerical value.
(Execute below code)
df1.loc[:,'Funds_in_million'] = df1['Funding'].apply(lambda x: float(x[1:-1])/1000 if x[-1] == 'K' else (float(x[1:-1])*1000 if x[-1] == 'B' else float(x[1:-1])))
company_df.loc[:,'Funds_in_million'] = company_df['Funding'].apply(lambda x: float(x[1:-1])/1000 if x[-1] == 'K' else (float(x[1:-1])*1000 if x[-1] == 'B' else float(x[1:-1])))
company_df
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | |
|---|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed | 0.63 |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating | 1.00 |
| 5 | 5to1 | 5to1.com | $19.3M | TC50 2009 | Contestant | Acquired | 19.30 |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating | 29.00 |
| 10 | AdhereTech | adheretech.com | $1.8M | Hardware Battlefield 2014 | Contestant | Operating | 1.80 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 657 | Zivity | zivity.com | $8M | TC40 2007 | Contestant | Operating | 8.00 |
| 658 | Zmorph | zmorph3d.com | $1M | - | Audience choice | Operating | 1.00 |
| 659 | Zocdoc | zocdoc.com | $223M | TC40 2007 | Contestant | Operating | 223.00 |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating | 3.40 |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating | 31.50 |
446 rows × 7 columns
C. Plot box plot for funds in million. [1 Mark]
box_plot = sns.boxplot(x = company_df['Funds_in_million'])
box_plot
<Axes: xlabel='Funds_in_million'>
D. Check the number of outliers greater than the upper fence. [1 Mark]
def find_outliers_IQR(company_df):
q1 = company_df.quantile(0.25)
q3 = company_df.quantile(0.75)
IQR = q3-q1
outliers = company_df[((company_df<(q1-1.5*IQR)) | (company_df>(q3+1.5*IQR)))]
return outliers
outliers = find_outliers_IQR(company_df["Funds_in_million"])
print('Number of outliers: ' + str(len(outliers)))
print('Max outlier value: ' + str(outliers.max()))
print('Min outlier value: ' + str(outliers.min()))
outliers
Number of outliers: 60 Max outlier value: 1700.0 Min outlier value: 24.0
6 29.0 31 24.0 40 50.9 49 40.0 56 205.0 108 32.5 113 182.1 128 35.4 130 38.0 132 37.1 138 72.0 139 64.0 154 1700.0 166 34.6 172 35.5 179 30.1 180 26.0 188 66.0 191 28.3 209 103.0 213 25.9 215 36.5 225 44.7 231 24.2 271 122.4 276 34.9 279 67.8 282 166.1 302 42.1 305 69.0 313 65.1 325 25.1 346 31.8 389 37.0 393 38.5 394 25.5 398 24.0 427 278.0 432 24.0 435 63.0 460 25.0 471 160.0 546 40.0 555 47.4 560 24.5 581 25.0 593 332.4 598 35.0 606 168.8 615 28.0 625 30.8 643 41.8 644 142.0 647 117.8 649 26.0 650 28.0 654 62.1 656 583.6 659 223.0 661 31.5 Name: Funds_in_million, dtype: float64
# To cap the outliers, calculate a upper limit and lower limit. For the upper limit, we will use the mean plus three
# standard deviations. For the lower limit, we will calculate it as the mean minus 3 standard deviations. Keep in mind,
# the calculation you use can depend on the data’s distribution.
upper_limit = company_df['Funds_in_million'].mean() + 3*company_df['Funds_in_million'].std()
print('upper_limit: ' + str(upper_limit))
lower_limit = company_df['Funds_in_million'].mean() - 3*company_df['Funds_in_million'].std()
print('lower_limit: ' + str(lower_limit))
upper_limit: 288.69261876000576 lower_limit: -254.20964162996088
company_df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Funds_in_million | 446.0 | 17.241489 | 90.48371 | 0.005 | 0.74525 | 2.2 | 9.475 | 1700.0 |
# After calculating the upper and lower limit, we use the numpy .where() function to apply the limits to 'Funds_in_million'.
# company_df['Funds_in_million'] = np.where(company_df['Funds_in_million'] > upper_limit,
# upper_limit,
# np.where(company_df['Funds_in_million'] < lower_limit,
# lower_limit,
# company_df['Funds_in_million']
# )
# )
company_df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Funds_in_million | 446.0 | 17.241489 | 90.48371 | 0.005 | 0.74525 | 2.2 | 9.475 | 1700.0 |
E. Check frequency of the OperatingState features classes. [1 Mark]
company_df['OperatingState'].value_counts()
Operating 319 Acquired 66 Closed 57 Ipo 4 Name: OperatingState, dtype: int64
Observations:
A. Is there any significant difference between Funds raised by companies that are still operating vs companies that closed down? [1 Mark]
company_df.groupby('OperatingState').sum()
C:\Users\Bhavya Govindrao\AppData\Local\Temp\ipykernel_8420\707961184.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
| Funds_in_million | |
|---|---|
| OperatingState | |
| Acquired | 872.0510 |
| Closed | 185.7157 |
| Ipo | 551.1000 |
| Operating | 6080.8372 |
Observations:
B. Write the null hypothesis and alternative hypothesis. [1 Mark]
# Create new data set with null which fills 0 with Nan
company_df_with_null = pd.read_csv('CompanyX_EU.csv')
company_df_with_null = company_df_with_null.fillna(0)
company_df_with_null
| Startup | Product | Funding | Event | Result | OperatingState | |
|---|---|---|---|---|---|---|
| 0 | 2600Hz | 2600hz.com | 0 | Disrupt SF 2013 | Contestant | Operating |
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed |
| 2 | 3DPrinterOS | 3dprinteros.com | 0 | Disrupt SF 2016 | Contestant | Operating |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating |
| 4 | 42 Technologies | 42technologies.com | 0 | Disrupt NYC 2013 | Contestant | Operating |
| ... | ... | ... | ... | ... | ... | ... |
| 657 | Zivity | zivity.com | $8M | TC40 2007 | Contestant | Operating |
| 658 | Zmorph | zmorph3d.com | $1M | - | Audience choice | Operating |
| 659 | Zocdoc | zocdoc.com | $223M | TC40 2007 | Contestant | Operating |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating |
662 rows × 6 columns
# Create new data set without null by dropping Nan
company_df_without_null = pd.read_csv('CompanyX_EU.csv')
company_df_without_null = company_df_without_null.dropna()
company_df_without_null
| Startup | Product | Funding | Event | Result | OperatingState | |
|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating |
| 5 | 5to1 | 5to1.com | $19.3M | TC50 2009 | Contestant | Acquired |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating |
| 10 | AdhereTech | adheretech.com | $1.8M | Hardware Battlefield 2014 | Contestant | Operating |
| ... | ... | ... | ... | ... | ... | ... |
| 657 | Zivity | zivity.com | $8M | TC40 2007 | Contestant | Operating |
| 658 | Zmorph | zmorph3d.com | $1M | - | Audience choice | Operating |
| 659 | Zocdoc | zocdoc.com | $223M | TC40 2007 | Contestant | Operating |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating |
446 rows × 6 columns
company_df
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | |
|---|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed | 0.63 |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating | 1.00 |
| 5 | 5to1 | 5to1.com | $19.3M | TC50 2009 | Contestant | Acquired | 19.30 |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating | 29.00 |
| 10 | AdhereTech | adheretech.com | $1.8M | Hardware Battlefield 2014 | Contestant | Operating | 1.80 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 657 | Zivity | zivity.com | $8M | TC40 2007 | Contestant | Operating | 8.00 |
| 658 | Zmorph | zmorph3d.com | $1M | - | Audience choice | Operating | 1.00 |
| 659 | Zocdoc | zocdoc.com | $223M | TC40 2007 | Contestant | Operating | 223.00 |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating | 3.40 |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating | 31.50 |
446 rows × 7 columns
company_df_with_null.nunique()
Startup 662 Product 657 Funding 241 Event 26 Result 5 OperatingState 4 dtype: int64
company_df_without_null.nunique()
Startup 446 Product 446 Funding 239 Event 26 Result 5 OperatingState 4 dtype: int64
company_df.nunique()
Startup 446 Product 446 Funding 239 Event 26 Result 5 OperatingState 4 Funds_in_million 239 dtype: int64
With null which are filled to 0 for Nan, independent column 'Funding' is 241. But without null where Nan are dropped, in this scenario independent column 'Funding' is 239.
So, when independent column 'Funding' Nan values are droped, which affects dependent column 'Funds_in_million' also affects that count is 239 same as column 'Funding'.
Hence there is no NULL hypothesis since since variables of a dependent attribute can be affected.
C. Test for significance and conclusion [1 Mark]
1 - There is a significant difference in Funds raised between companies that are still opened and companies that are being closed down.(i.e): Operating companies have raised 6080.8372 millions and companies that have been closed raised just 185.7157 millions.
2 - Columns with Null values can affect the Dependent variable. Which means if we drop the columns that have Null value, the total count (frequence) of millions in 'Funds_in_million' will be decreased from 241 to 239.
D. Make a copy of the original data frame. [1 Mark]
company_df_copy = pd.read_csv('CompanyX_EU.csv')
company_df_copy
| Startup | Product | Funding | Event | Result | OperatingState | |
|---|---|---|---|---|---|---|
| 0 | 2600Hz | 2600hz.com | NaN | Disrupt SF 2013 | Contestant | Operating |
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed |
| 2 | 3DPrinterOS | 3dprinteros.com | NaN | Disrupt SF 2016 | Contestant | Operating |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating |
| 4 | 42 Technologies | 42technologies.com | NaN | Disrupt NYC 2013 | Contestant | Operating |
| ... | ... | ... | ... | ... | ... | ... |
| 657 | Zivity | zivity.com | $8M | TC40 2007 | Contestant | Operating |
| 658 | Zmorph | zmorph3d.com | $1M | - | Audience choice | Operating |
| 659 | Zocdoc | zocdoc.com | $223M | TC40 2007 | Contestant | Operating |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating |
662 rows × 6 columns
E. Check frequency distribution of Result variables. [1 Mark]
plt.hist(company_df['Result'])
#plt.xlabel('Frequency distribution of Result')
(array([312., 0., 25., 0., 0., 65., 0., 25., 0., 19.]), array([0. , 0.4, 0.8, 1.2, 1.6, 2. , 2.4, 2.8, 3.2, 3.6, 4. ]), <BarContainer object of 10 artists>)
F. Calculate percentage of winners that are still operating and percentage of contestants that are still operating [1 Mark]
describe_result = company_df.groupby(['Result','OperatingState']).describe()
describe_result
| Funds_in_million | |||||||||
|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | ||
| Result | OperatingState | ||||||||
| Audience choice | Closed | 2.0 | 1.595000 | 1.986970 | 0.1900 | 0.89250 | 1.5950 | 2.2975 | 3.0 |
| Operating | 23.0 | 21.538013 | 48.480902 | 0.0200 | 1.00000 | 2.1000 | 4.7000 | 168.8 | |
| Contestant | Acquired | 45.0 | 9.752911 | 15.903357 | 0.0200 | 1.00000 | 3.0000 | 12.0000 | 69.0 |
| Closed | 50.0 | 3.140514 | 5.765642 | 0.0930 | 0.38125 | 0.9085 | 3.5000 | 35.5 | |
| Ipo | 3.0 | 161.700000 | 153.531658 | 34.9000 | 76.35000 | 117.8000 | 225.1000 | 332.4 | |
| Operating | 214.0 | 17.245804 | 119.197322 | 0.0050 | 0.59250 | 1.8500 | 5.7000 | 1700.0 | |
| Finalist | Acquired | 13.0 | 15.111538 | 14.897681 | 0.1500 | 6.30000 | 10.3000 | 17.4000 | 44.7 |
| Closed | 5.0 | 5.100000 | 8.785784 | 0.7000 | 1.00000 | 1.2000 | 1.8000 | 20.8 | |
| Ipo | 1.0 | 66.000000 | NaN | 66.0000 | 66.00000 | 66.0000 | 66.0000 | 66.0 | |
| Operating | 46.0 | 25.997226 | 89.806214 | 0.0500 | 1.10000 | 4.6500 | 16.0750 | 583.6 | |
| Runner up | Acquired | 1.0 | 16.500000 | NaN | 16.5000 | 16.50000 | 16.5000 | 16.5000 | 16.5 |
| Operating | 18.0 | 19.288150 | 41.643980 | 0.0857 | 1.52500 | 7.2500 | 21.1250 | 182.1 | |
| Winner | Acquired | 7.0 | 31.460000 | 49.726584 | 0.5200 | 8.70000 | 10.5000 | 24.9000 | 142.0 |
| Operating | 18.0 | 19.544544 | 25.365468 | 0.0550 | 1.70000 | 11.5500 | 32.9500 | 103.0 | |
# list(company_df.Result.value_counts())
total_count = company_df.Result.value_counts()
total_count
Contestant 312 Finalist 65 Audience choice 25 Winner 25 Runner up 19 Name: Result, dtype: int64
total_count_sum = total_count.sum()
total_count_sum
446
winner_operating_count = describe_result.iloc[13, 0]
winner_operating_count
18.0
# Winner and Operating count is 18
winner_operating_count_percenatge = (winner_operating_count / total_count_sum) * 100
winner_operating_count_percenatge
4.0358744394618835
contestant_operating_count = describe_result.iloc[5, 0]
contestant_operating_count
214.0
# Contestant and Operating count is 214
contestant_operating_count_percenatge = (contestant_operating_count / total_count_sum) * 100
contestant_operating_count_percenatge
47.98206278026906
print('The percentage of winners that are still operating is %1.4f' % winner_operating_count_percenatge, '%')
print('The percentage of contestants that are still operating is %1.4f' % contestant_operating_count_percenatge, '%')
The percentage of winners that are still operating is 4.0359 % The percentage of contestants that are still operating is 47.9821 %
G. Write your hypothesis comparing the proportion of companies that are operating between winners and contestants: [2 Mark]
The winners that are still operating count is 4% and Constestants that are still operating count is 48%.
So, the companies that are operating have winners are lesser than the companies that have contestants.
H. Test for significance and conclusion [1 Mark]
Since there is lot of significance difference between winners and contestants that are still operating as the percentage of winners is 4% and contestants are 48%.
I. Select only the Event that has ‘disrupt’ keyword from 2013 onwards. [1 Mark]
disrupt_result = company_df[company_df['Event'].str.contains('Disrupt', case=False)]
disrupt_result
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | |
|---|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed | 0.63 |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating | 1.00 |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating | 29.00 |
| 13 | Agrilyst | agrilyst.com | $1M | Disrupt SF 2015 | Winner | Operating | 1.00 |
| 14 | Aiden | aiden.ai | $750K | Disrupt London 2016 | Contestant | Operating | 0.75 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 650 | YourMechanic | yourmechanic.com | $28M | Disrupt SF 2012 | Winner | Operating | 28.00 |
| 654 | ZEFR | zefr.com | $62.1M | Disrupt NYC 2010 | Contestant | Operating | 62.10 |
| 656 | Zenefits | zenefits.com | $583.6M | Disrupt NYC 2013 | Finalist | Operating | 583.60 |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating | 3.40 |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating | 31.50 |
318 rows × 7 columns
# Creating new column 'Disrupt_year' which holds the year
disrupt_result.loc[:,'Disrupt_year'] = disrupt_result['Event'].apply(lambda x: float(x[-4:]) if x[-1] == 'K' else (float(x[-4:]) if x[-1] == 'B' else float(x[-4:])))
disrupt_result
C:\Users\Bhavya Govindrao\AppData\Local\Temp\ipykernel_8420\1084268816.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | Disrupt_year | |
|---|---|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed | 0.63 | 2013.0 |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating | 1.00 | 2016.0 |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating | 29.00 | 2011.0 |
| 13 | Agrilyst | agrilyst.com | $1M | Disrupt SF 2015 | Winner | Operating | 1.00 | 2015.0 |
| 14 | Aiden | aiden.ai | $750K | Disrupt London 2016 | Contestant | Operating | 0.75 | 2016.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 650 | YourMechanic | yourmechanic.com | $28M | Disrupt SF 2012 | Winner | Operating | 28.00 | 2012.0 |
| 654 | ZEFR | zefr.com | $62.1M | Disrupt NYC 2010 | Contestant | Operating | 62.10 | 2010.0 |
| 656 | Zenefits | zenefits.com | $583.6M | Disrupt NYC 2013 | Finalist | Operating | 583.60 | 2013.0 |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating | 3.40 | 2013.0 |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating | 31.50 | 2012.0 |
318 rows × 8 columns
# Filter and display Disrupt year less then 2013
disrupt_result[disrupt_result['Disrupt_year'] < 2013]
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | Disrupt_year | |
|---|---|---|---|---|---|---|---|---|
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating | 29.0 | 2011.0 |
| 22 | Alicanto | alicanto.com | $4M | Disrupt SF 2012 | Contestant | Operating | 4.0 | 2012.0 |
| 27 | Amen. | getamen.com | $3M | Disrupt SF 2011 | Contestant | Acquired | 3.0 | 2011.0 |
| 34 | Appbistro | appbistro.com | $600K | Disrupt NYC 2010 | Contestant | Closed | 0.6 | 2010.0 |
| 35 | AppChina | appchina.com | $7.1M | Disrupt Beijing 2011 | Contestant | Operating | 7.1 | 2011.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 632 | WeDemand | wedemand.com | $1M | Disrupt SF 2012 | Contestant | Operating | 1.0 | 2012.0 |
| 649 | YouNow | younow.com | $26M | Disrupt SF 2011 | Contestant | Operating | 26.0 | 2011.0 |
| 650 | YourMechanic | yourmechanic.com | $28M | Disrupt SF 2012 | Winner | Operating | 28.0 | 2012.0 |
| 654 | ZEFR | zefr.com | $62.1M | Disrupt NYC 2010 | Contestant | Operating | 62.1 | 2010.0 |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating | 31.5 | 2012.0 |
125 rows × 8 columns
# Filter and display Disrupt year greater then 2013
disrupt_result[disrupt_result['Disrupt_year'] > 2013]
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | Disrupt_year | |
|---|---|---|---|---|---|---|---|---|
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating | 1.000 | 2016.0 |
| 13 | Agrilyst | agrilyst.com | $1M | Disrupt SF 2015 | Winner | Operating | 1.000 | 2015.0 |
| 14 | Aiden | aiden.ai | $750K | Disrupt London 2016 | Contestant | Operating | 0.750 | 2016.0 |
| 16 | Aircall | aircall.io | $11.6M | Disrupt SF 2015 | Contestant | Operating | 11.600 | 2015.0 |
| 18 | AirHelp | airhelp.com | $12.2M | Disrupt NYC 2014 | Contestant | Operating | 12.200 | 2014.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 633 | Wellth | wellthapp.com | $2.5M | Disrupt NY 2015 | Contestant | Operating | 2.500 | 2015.0 |
| 634 | Wellthy | wellthy.com | $2M | Disrupt SF 2015 | Contestant | Operating | 2.000 | 2015.0 |
| 635 | Welltwigs | welltwigs.com | $50K | Disrupt NY 2015 | Contestant | Operating | 0.050 | 2015.0 |
| 642 | Xendo | xendo.com | $28K | Disrupt SF 2014 | Contestant | Acquired | 0.028 | 2014.0 |
| 646 | YayPay Inc | yaypay.com | $900K | Disrupt London 2015 | Contestant | Operating | 0.900 | 2015.0 |
140 rows × 8 columns
# Filter and display Disrupt year greater then and equal to 2013
disrupt_result[disrupt_result['Disrupt_year'] >= 2013]
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | Disrupt_year | |
|---|---|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed | 0.630 | 2013.0 |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating | 1.000 | 2016.0 |
| 13 | Agrilyst | agrilyst.com | $1M | Disrupt SF 2015 | Winner | Operating | 1.000 | 2015.0 |
| 14 | Aiden | aiden.ai | $750K | Disrupt London 2016 | Contestant | Operating | 0.750 | 2016.0 |
| 16 | Aircall | aircall.io | $11.6M | Disrupt SF 2015 | Contestant | Operating | 11.600 | 2015.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 641 | Workspot | workspot.com | $15.8M | Disrupt NYC 2013 | Contestant | Operating | 15.800 | 2013.0 |
| 642 | Xendo | xendo.com | $28K | Disrupt SF 2014 | Contestant | Acquired | 0.028 | 2014.0 |
| 646 | YayPay Inc | yaypay.com | $900K | Disrupt London 2015 | Contestant | Operating | 0.900 | 2015.0 |
| 656 | Zenefits | zenefits.com | $583.6M | Disrupt NYC 2013 | Finalist | Operating | 583.600 | 2013.0 |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating | 3.400 | 2013.0 |
193 rows × 8 columns
# Filter and display Disrupt year less then and equal to 2013
disrupt_result[disrupt_result['Disrupt_year'] <= 2013]
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | Disrupt_year | |
|---|---|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed | 0.63 | 2013.0 |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating | 29.00 | 2011.0 |
| 22 | Alicanto | alicanto.com | $4M | Disrupt SF 2012 | Contestant | Operating | 4.00 | 2012.0 |
| 27 | Amen. | getamen.com | $3M | Disrupt SF 2011 | Contestant | Acquired | 3.00 | 2011.0 |
| 34 | Appbistro | appbistro.com | $600K | Disrupt NYC 2010 | Contestant | Closed | 0.60 | 2010.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 650 | YourMechanic | yourmechanic.com | $28M | Disrupt SF 2012 | Winner | Operating | 28.00 | 2012.0 |
| 654 | ZEFR | zefr.com | $62.1M | Disrupt NYC 2010 | Contestant | Operating | 62.10 | 2010.0 |
| 656 | Zenefits | zenefits.com | $583.6M | Disrupt NYC 2013 | Finalist | Operating | 583.60 | 2013.0 |
| 660 | Zula | zulaapp.com | $3.4M | Disrupt SF 2013 | Audience choice | Operating | 3.40 | 2013.0 |
| 661 | Zumper | zumper.com | $31.5M | Disrupt SF 2012 | Finalist | Operating | 31.50 | 2012.0 |
178 rows × 8 columns